import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
import scipy.sparse as sparse

import json
import pickle

from scipy.stats import entropy
from numpy.linalg import norm

# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({"figure.figsize": (10, 7)})

def JSD(P, Q):
    """Return the Jensen-Shannon divergence between ``P`` and ``Q``.

    Both inputs are first rescaled by their L1 norm, so they do not have to
    sum to one. The result is the average of the two relative entropies
    (as computed by ``scipy.stats.entropy``, natural log by default) of each
    rescaled input against their equal-weight mixture.
    """
    p_unit = P / norm(P, ord=1)
    q_unit = Q / norm(Q, ord=1)
    mixture = 0.5 * (p_unit + q_unit)
    kl_p_to_mix = entropy(p_unit, mixture)
    kl_q_to_mix = entropy(q_unit, mixture)
    return 0.5 * (kl_p_to_mix + kl_q_to_mix)

# Load the intermediate data file with legislator TBIP scores (it lives in
# the subdirectory with the code used to obtain the TBIP scores).
final_df = pd.read_csv('final_codebase_for_creation_of_main_legislator_info_and_tbip_file/legislator_info_and_tbip_congresses_115_and_116.csv')

# Bioguide_ID -> speech-based TBIP score, keeping only rows where the score is present.
has_speech_tbip = final_df['TBIP_Floor_Speeches'].notnull()
bid_to_speech_tbip = dict(zip(final_df.loc[has_speech_tbip, 'Bioguide_ID'],
                              final_df.loc[has_speech_tbip, 'TBIP_Floor_Speeches']))

# Bioguide_ID -> tweet-based TBIP score, keeping only rows where the score is present.
has_tweet_tbip = final_df['TBIP_Tweets'].notnull()
bid_to_tweet_tbip = dict(zip(final_df.loc[has_tweet_tbip, 'Bioguide_ID'],
                             final_df.loc[has_tweet_tbip, 'TBIP_Tweets']))

# Column names of the 50 topic-proportion columns: 'Topic0' ... 'Topic49'.
topic_cols = ['Topic' + str(topic_idx) for topic_idx in range(50)]

# Load the file with floor speech topic proportions per legislator (it lives
# in the subdirectory with the code used to obtain the TBIP scores).
speech_topic_per_author = pd.read_csv('final_codebase_for_creation_of_main_legislator_info_and_tbip_file/speeches_results/topic_proportions_per_author.csv')

# Keep only the legislators that have a speech-based TBIP score.
speech_has_score = speech_topic_per_author['Bioguide_ID'].isin(bid_to_speech_tbip.keys())
speech_topic_per_author = speech_topic_per_author[speech_has_score]

# One row of topic proportions per legislator; bids_speech[k] labels row k.
speech_topic_dist = np.array(speech_topic_per_author[topic_cols])
bids_speech = speech_topic_per_author['Bioguide_ID'].tolist()

# For every unordered pair of legislators, record the absolute ideal-point
# gap and the JS divergence between their topic distributions.
speech_pairwise_topic_dists = []
speech_pairwise_abs_diff_ideal_points = []
n_speech_authors = len(bids_speech)
for row_a, bid_a in enumerate(bids_speech):
    score_a = bid_to_speech_tbip[bid_a]
    for row_b in range(row_a + 1, n_speech_authors):
        bid_b = bids_speech[row_b]
        speech_pairwise_abs_diff_ideal_points.append(abs(score_a - bid_to_speech_tbip[bid_b]))
        speech_pairwise_topic_dists.append(JSD(speech_topic_dist[row_a, :],
                                               speech_topic_dist[row_b, :]))
        
# Load the file with tweet topic proportions per legislator (it lives in the
# subdirectory with the code used to obtain the TBIP scores).
tweet_topic_per_author = pd.read_csv('final_codebase_for_creation_of_main_legislator_info_and_tbip_file/tweets_results/topic_proportions_per_author.csv')

# Keep only the legislators that have a tweet-based TBIP score.
tweet_has_score = tweet_topic_per_author['Bioguide_ID'].isin(bid_to_tweet_tbip.keys())
tweet_topic_per_author = tweet_topic_per_author[tweet_has_score]

# One row of topic proportions per legislator; bids_tweet[k] labels row k.
tweet_topic_dist = np.array(tweet_topic_per_author[topic_cols])
bids_tweet = tweet_topic_per_author['Bioguide_ID'].tolist()

# For every unordered pair of legislators, record the absolute ideal-point
# gap and the JS divergence between their topic distributions.
tweet_pairwise_topic_dists = []
tweet_pairwise_abs_diff_ideal_points = []
n_tweet_authors = len(bids_tweet)
for row_a, bid_a in enumerate(bids_tweet):
    score_a = bid_to_tweet_tbip[bid_a]
    for row_b in range(row_a + 1, n_tweet_authors):
        bid_b = bids_tweet[row_b]
        tweet_pairwise_abs_diff_ideal_points.append(abs(score_a - bid_to_tweet_tbip[bid_b]))
        tweet_pairwise_topic_dists.append(JSD(tweet_topic_dist[row_a, :],
                                              tweet_topic_dist[row_b, :]))


# Print the correlation scores for floor speeches, then draw the scatter plot
# of pairwise topic divergence vs. ideal-point gap (Figure A6).
speech_spearman = spearmanr(speech_pairwise_topic_dists,
                            speech_pairwise_abs_diff_ideal_points)[0]
speech_pearson = pearsonr(speech_pairwise_topic_dists,
                          speech_pairwise_abs_diff_ideal_points)[0]
print('Spearman Correlation = ' + str(round(speech_spearman, 3)))
print('Pearson R Correlation = ' + str(round(speech_pearson, 3)))

plt.scatter(speech_pairwise_topic_dists,
            speech_pairwise_abs_diff_ideal_points,
            s=5,
            color='purple')
plt.xlabel('Pairwise JS Divergence\nbetween\nTopic Distributions')
plt.ylabel('Pairwise Absolute Difference\nbetween \nIdeal Point Values')
plt.title('Speech-based TBIP')

# Write Figure A6 to disk with an opaque white background.
plt.savefig('pairwise_scatterplot_speech_topic_dist_vs_ideal_point_diff.png',
            facecolor='white',
            transparent=False)


# Print the correlation scores for tweets, then draw the scatter plot of
# pairwise topic divergence vs. ideal-point gap (Figure A7).
print('Spearman Correlation = ' + str(round(spearmanr(tweet_pairwise_topic_dists,
                                                      tweet_pairwise_abs_diff_ideal_points)[0], 3)))
print('Pearson R Correlation = ' + str(round(pearsonr(tweet_pairwise_topic_dists,
                                                      tweet_pairwise_abs_diff_ideal_points)[0], 3)))

# Start a fresh figure. pyplot calls draw on the *current* figure, so without
# this the tweet points would be painted on top of whatever was plotted
# earlier in the script and the saved image would contain both data sets.
plt.figure()
plt.scatter(tweet_pairwise_topic_dists,
            tweet_pairwise_abs_diff_ideal_points,
            s=5,
            color='maroon')
plt.xlabel('Pairwise JS Divergence\nbetween\nTopic Distributions')
plt.ylabel('Pairwise Absolute Difference\nbetween \nIdeal Point Values')
plt.title('Tweet-based TBIP')
#plt.show()

# Write Figure A7 to disk with an opaque white background.
plt.savefig('pairwise_scatterplot_tweet_topic_dist_vs_ideal_point_diff.png',
            facecolor='white',
            transparent=False)
